Remove “#” in the below code chunk if you want to run it
#install.packages(c("phyloseq", "ggplot2", "microbiome", "dplyr", "plyr",
# "ggpubr", "vegan", "plotly", "ANCOMBC", "tibble",
# "knitr", "viridis", "lmerTest"))
# Cleaning the global environment:
rm(list=ls())
# Loading libraries:
library(phyloseq)
library(ggplot2)
library(microbiome)
library(dplyr)
library(plyr)
library(ggpubr)
library(vegan)
library(plotly)
library(ggpubr)
library(plotly)
library(ANCOMBC)
library(tibble)
library(lmerTest)
library(tidyverse)
library(mixOmics)
theme_set(theme_classic())
# Setting the working directory:
#setwd("~/Documents/Postdoc/Microbiome course Belgrade 2024/Second day")
# Load the cured phyloseq object:
ps <- readRDS("MPS.16S.triads.mod2.RDS")
ps_2 <- prune_samples(ps@sam_data$Sample_Type!="PostFMT", ps)
Important: ps_2 is a curated and normalized phyloseq object which was created in the previous step.
Taxonomy levels:
# Summary of basic information of phyloseq object:
summarize_phyloseq(ps_2)
## [[1]]
## [1] "1] Min. number of reads = 31640"
##
## [[2]]
## [1] "2] Max. number of reads = 44945"
##
## [[3]]
## [1] "3] Total number of reads = 1645521"
##
## [[4]]
## [1] "4] Average number of reads = 37398.2045454545"
##
## [[5]]
## [1] "5] Median number of reads = 37625.5"
##
## [[6]]
## [1] "7] Sparsity = 0.838128020424911"
##
## [[7]]
## [1] "6] Any OTU sum to 1 or less? YES"
##
## [[8]]
## [1] "8] Number of singletons = 374"
##
## [[9]]
## [1] "9] Percent of OTUs that are singletons \n (i.e. exactly one read detected across all samples)3.17619525242394"
##
## [[10]]
## [1] "10] Number of sample variables are: 6"
##
## [[11]]
## [1] "Sample_Type" "Donor_Sample_ID" "Host_Sex" "Subject_ID"
## [5] "Time_Point" "Age"
colnames(tax_table(ps_2))
## [1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
# How many family/phyla/genera are there in the phylosec object?
table(tax_table(ps_2)[, "Phylum"], exclude = NULL)
##
## Actinobacteria Bacteroidetes Cyanobacteria Epsilonbacteraeota
## 225 545 14 1
## Euryarchaeota Firmicutes Fusobacteria Kiritimatiellaeota
## 8 2057 7 1
## Lentisphaerae Patescibacteria Proteobacteria Synergistetes
## 7 2 80 5
## Tenericutes Verrucomicrobia <NA>
## 21 15 3
sort(table(tax_table(ps_2)[, "Phylum"], exclude = NULL), decreasing = TRUE)
##
## Firmicutes Bacteroidetes Actinobacteria Proteobacteria
## 2057 545 225 80
## Tenericutes Verrucomicrobia Cyanobacteria Euryarchaeota
## 21 15 14 8
## Fusobacteria Lentisphaerae Synergistetes <NA>
## 7 7 5 3
## Patescibacteria Epsilonbacteraeota Kiritimatiellaeota
## 2 1 1
# How many unique family/phyla/genera does this data set have?
length(unique(tax_table(ps_2)[, "Phylum"]))
## [1] 15
# 15 or 14???
# Excluding the Phylum=NA:
ps_3 <- subset_taxa(ps_2, !is.na(Phylum))
sum(is.na(ps_3@tax_table[,"Phylum"])==T)
## [1] 0
table(ps_3@tax_table[,"Phylum"], useNA = "ifany")
##
## Actinobacteria Bacteroidetes Cyanobacteria Epsilonbacteraeota
## 225 545 14 1
## Euryarchaeota Firmicutes Fusobacteria Kiritimatiellaeota
## 8 2057 7 1
## Lentisphaerae Patescibacteria Proteobacteria Synergistetes
## 7 2 80 5
## Tenericutes Verrucomicrobia
## 21 15
prevdf = apply(X = otu_table(ps_3), MARGIN = ifelse(taxa_are_rows(ps_3), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
# Add taxonomy and total read counts to this data.frame
prevdf <- data.frame(Prevalence = prevdf, TotalAbundance = taxa_sums(ps_3), tax_table(ps_3))
plyr::ddply(prevdf, "Phylum", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})
## Phylum 1 2
## 1 Actinobacteria 5.9377778 1336
## 2 Bacteroidetes 4.8935780 2667
## 3 Cyanobacteria 2.0000000 28
## 4 Epsilonbacteraeota 0.0000000 0
## 5 Euryarchaeota 4.3750000 35
## 6 Firmicutes 8.1137579 16690
## 7 Fusobacteria 0.8571429 6
## 8 Kiritimatiellaeota 0.0000000 0
## 9 Lentisphaerae 2.0000000 14
## 10 Patescibacteria 3.5000000 7
## 11 Proteobacteria 4.7000000 376
## 12 Synergistetes 1.4000000 7
## 13 Tenericutes 1.7142857 36
## 14 Verrucomicrobia 6.2000000 93
# Subset to the remaining phyla
prevdf1 = subset(prevdf, Phylum %in% get_taxa_unique(ps_3, "Phylum"))
# Visualization:
ggplot(prevdf1, aes(TotalAbundance, Prevalence / nsamples(ps_3),color=Phylum)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) +
geom_point(size = 2, alpha = 0.7) + scale_color_viridis_d(option = "plasma") +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap(~Phylum) + theme(legend.position="none")
# Filtering based on the prevalence:
prevalenceThreshold = 0.05 * nsamples(ps_3) ###### considers taxa present in at last 5% of all samples
prevalenceThreshold
## [1] 2.2
keepTaxa = rownames(prevdf)[(prevdf$Prevalence >= prevalenceThreshold)]
ps_4 = prune_taxa(keepTaxa, ps_3)
# Use phyloseq's built-in functions for filtering:
ps_5 <- filter_taxa(ps_3, function(x) sum(x > 0) >= (0.05 * nsamples(ps_3)), prune = TRUE)
# ps_4 = ps_5, it keeps only taxa that appear in at least 5% of all samples.
ntaxa(ps_3)
## [1] 2988
ntaxa(ps_4)
## [1] 1607
ntaxa(ps_5)
## [1] 1607
# filter based on a certain certain family/phyla/genera:
ps_3_firmicutes <- subset_taxa(ps_3, Phylum == "Firmicutes")
# filter based on certain group/s (males for example)
ps_3_males <- subset_samples(ps_3, Host_Sex == "Male")
plot_bar(ps_4,fill="Phylum") +
theme(legend.position="bottom") +
theme(axis.title.x=element_blank(), axis.text.x=element_blank())
For a better visualization we can normalize the libraries so that every sample has similar (relative) abundances (range between 0 and 1). there are multiple ways to transform data and get relative abundances:
ps_4_RA <- transform_sample_counts(ps_4, function(x) 100 * x/sum(x))
ps_4_RA <- microbiome::transform(ps_4,"compositional")
plot of relative abundances:
plot_bar(ps_4_RA,fill='Phylum') +
theme(legend.position="bottom") +
theme(axis.title.x=element_blank(), axis.text.x=element_blank())
# group by Sample_Type:
plot_bar(ps_4_RA,fill="Phylum") +
theme(legend.position="bottom") +
theme(axis.title.x=element_blank(), axis.text.x=element_blank()) +
facet_grid(~Sample_Type, scales="free_x")
# Average per group:
phylumGlommed = tax_glom(ps_4_RA, "Phylum")
plot_composition(phylumGlommed, average_by = "Sample_Type", transform = "compositional")
# Does the labeling/legend match with with the phyla names?
# to find the related phylum names:
tax_table(phylumGlommed)
## Taxonomy Table: [11 taxa by 7 taxonomic ranks]:
## Kingdom Phylum Class Order Family Genus Species
## ASVX_3 "Bacteria" "Firmicutes" NA NA NA NA NA
## ASVX_846 "Bacteria" "Tenericutes" NA NA NA NA NA
## ASVX_1695 "Bacteria" "Synergistetes" NA NA NA NA NA
## ASVX_112 "Archaea" "Euryarchaeota" NA NA NA NA NA
## ASVX_152 "Bacteria" "Verrucomicrobia" NA NA NA NA NA
## ASVX_1019 "Bacteria" "Lentisphaerae" NA NA NA NA NA
## ASVX_139 "Bacteria" "Proteobacteria" NA NA NA NA NA
## ASVX_69 "Bacteria" "Bacteroidetes" NA NA NA NA NA
## ASVX_2355 "Bacteria" "Patescibacteria" NA NA NA NA NA
## ASVX_273 "Bacteria" "Cyanobacteria" NA NA NA NA NA
## ASVX_20 "Bacteria" "Actinobacteria" NA NA NA NA NA
# changing the labels in the legend:
colnames(phylumGlommed@otu_table) <- phylumGlommed@tax_table[,2]
# plot again:
plot_composition(phylumGlommed, average_by = "Sample_Type", transform = "compositional", legend = phylanames)
Now let’s only look at “Firmicutes” phyla:
# subset the Firmicutes phyla:
ps_3_Firmicutes = subset_taxa(ps_3, Phylum== "Firmicutes")
# plot_abundance function:
plot_abundance = function(physeq,title = "",Facet = "Family", Color = "Family"){
mphyseq = psmelt(physeq)
mphyseq <- subset(mphyseq, Abundance > 0)
ggplot(data = mphyseq, mapping = aes_string(x = "Sample_Type",y = "Abundance",
color = Color, fill = Color)) +
geom_violin(fill = NA) +
geom_point(size = 1, alpha = 0.3,
position = position_jitter(width = 0.3)) +
facet_wrap(facets = Facet) + scale_y_log10()+
theme(legend.position="none")
}
# look at each family in Firmicutes phyla:
plot_abundance(ps_3_Firmicutes,"")
# how if we want to look at a specific family like "Lachnospiraceae":
ps_3_Lachnospiraceae = subset_taxa(ps_2, Family == "Lachnospiraceae")
plot_abundance = function(physeq,title = "",Facet = "Genus", Color = "Genus"){
mphyseq = psmelt(physeq)
mphyseq <- subset(mphyseq, Abundance > 0)
ggplot(data = mphyseq, mapping = aes_string(x = "Sample_Type",y = "Abundance",
color = Color, fill = Color)) + geom_violin(fill = NA) + geom_point(size = 1, alpha = 0.3,position = position_jitter(width = 0.3)) + facet_wrap(facets = Facet) + scale_y_log10()+ theme(legend.position="none") }
plot_abundance(ps_3_Lachnospiraceae,"")
A much debated topic in microbiome data analysis is how to account for uneven sampling depth. Differences in sampling depth introduce a bias which needs to be accounted for. For instance, considering alpha diversity you could expect a strong correlation between observed species and sampling depth. One approach would be to subsample, (rarefy the data) in such a way that all samples would have equal sampling depth. However, some consider this bad practice as data is thrown away. Another issue is the loss of samples when a threshold higher than the actual sampling depth is set. If data is not normalized on the count table, sampling depth needs to be accounted for in the statistical test.
Others however have shown that not subsampling your data still results in bias. In the end one needs to consider the trade off between reporting of a false significant association (Type 1 error) driven by a potential sampling bias, or missing significant associations (Type 2 error) because of reduced statistical power that comes from data sub sampling and sample drop out.
Since statistical analysis are complex enough as it is, we prefer to err on the side of caution and recommend sub sampling the data prior to any analysis unless study design integrity is compromised. Here we proceed with normalization by a single step sub sampling of the data since we believe Type 1 errors are worse than Type 2.
** What would be an appropriate sampling depth for this dataset? **
sample_sums(ps_2) %>% sort()
## 2018_0007_00_SA508SA705 2018_0007_00_SB501SA702 2018_0007_00_SB501SA705
## 31640 32083 32086
## 2018_0007_00_SB505SA706 2018_0007_00_SA502SA708 2018_0007_00_SB504SA708
## 32316 32493 32830
## 2018_0007_00_SA501SA703 2018_0007_00_SB503SA712 2018_0007_00_SB503SA708
## 32839 33623 33834
## 2018_0007_00_SA507SA708 2018_0007_00_SA504SA704 2018_0007_00_SA504SB705
## 33959 34004 34079
## 2018_0007_00_SA505SA705 2018_0007_00_SA505SA711 2018_0007_00_SA506SA711
## 34218 34705 35056
## 2018_0007_00_SB503SA704 2018_0007_00_SA508SB710 2018_0007_00_SA502SA707
## 35385 35598 35732
## 2018_0007_00_SA505SB705 2018_0007_00_SA505SB709 2018_0007_00_SA502SB706
## 35767 37095 37376
## 2018_0007_00_SA508SB712 2018_0007_00_SA506SA707 2018_0007_00_SA504SA711
## 37469 37782 37873
## 2018_0007_00_SB507SA705 2018_0007_00_SA503SA708 2018_0007_00_SA505SA706
## 38589 38919 39060
## 2018_0007_00_SA502SB712 2018_0007_00_SA507SB712 2018_0007_00_SA503SB707
## 39198 39361 39389
## 2018_0007_00_SA502SB708 2018_0007_00_SA503SA705 2018_0007_00_SB504SA711
## 39547 39549 39705
## 2018_0007_00_SA506SB701 2018_0007_00_SA507SA701 2018_0007_00_SA502SB703
## 40278 40384 40760
## 2018_0007_00_SA503SA707 2018_0007_00_SA507SB703 2018_0007_00_SA505SB703
## 40777 40843 41100
## 2018_0007_00_SA507SA709 2018_0007_00_SB508SA702 2018_0007_00_SA507SB702
## 41434 43023 44265
## 2018_0007_00_SA506SB703 2018_0007_00_SA507SB711
## 44553 44945
ps_2 <- rarefy_even_depth(ps_2, sample.size = 20000, rngseed = 211202)
There are different functions from different packages (phyloseq,
vegan, microbiome) to calculate alpha diversity
Here we use microbiome package:
# check if any ASVs/taxa are not present in any of the samples
any(taxa_sums(ps_2)==0)
## [1] FALSE
# prune dataset for taxa that are not present in any of the samples
# ps_2 <- prune_taxa(taxa_sums(ps_2)>0, ps_2)
# check how many ASVs/taxa are kept
ntaxa(ps_2)
## [1] 2610
# check the taxonomic rank information
rank_names(ps_2)
## [1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
# check samples with low sequencing depth <1000
sample_sums(ps_2) < 1000
## 2018_0007_00_SA501SA703 2018_0007_00_SA502SA707 2018_0007_00_SA502SA708
## FALSE FALSE FALSE
## 2018_0007_00_SA502SB703 2018_0007_00_SA502SB706 2018_0007_00_SA502SB708
## FALSE FALSE FALSE
## 2018_0007_00_SA502SB712 2018_0007_00_SA503SA705 2018_0007_00_SA503SA707
## FALSE FALSE FALSE
## 2018_0007_00_SA503SA708 2018_0007_00_SA503SB707 2018_0007_00_SA504SA704
## FALSE FALSE FALSE
## 2018_0007_00_SA504SA711 2018_0007_00_SA504SB705 2018_0007_00_SA505SA705
## FALSE FALSE FALSE
## 2018_0007_00_SA505SA706 2018_0007_00_SA505SA711 2018_0007_00_SA505SB703
## FALSE FALSE FALSE
## 2018_0007_00_SA505SB705 2018_0007_00_SA505SB709 2018_0007_00_SA506SA707
## FALSE FALSE FALSE
## 2018_0007_00_SA506SA711 2018_0007_00_SA506SB701 2018_0007_00_SA506SB703
## FALSE FALSE FALSE
## 2018_0007_00_SA507SA701 2018_0007_00_SA507SA708 2018_0007_00_SA507SA709
## FALSE FALSE FALSE
## 2018_0007_00_SA507SB702 2018_0007_00_SA507SB703 2018_0007_00_SA507SB711
## FALSE FALSE FALSE
## 2018_0007_00_SA507SB712 2018_0007_00_SA508SA705 2018_0007_00_SA508SB710
## FALSE FALSE FALSE
## 2018_0007_00_SA508SB712 2018_0007_00_SB501SA702 2018_0007_00_SB501SA705
## FALSE FALSE FALSE
## 2018_0007_00_SB503SA704 2018_0007_00_SB503SA708 2018_0007_00_SB503SA712
## FALSE FALSE FALSE
## 2018_0007_00_SB504SA708 2018_0007_00_SB504SA711 2018_0007_00_SB505SA706
## FALSE FALSE FALSE
## 2018_0007_00_SB507SA705 2018_0007_00_SB508SA702
## FALSE FALSE
range(sample_sums(ps_2))
## [1] 20000 20000
# remove samples with low sequencing depth: (we do not have it here)
#ps_2 <- subset_samples(ps_2, sample_sums(ps_2) >1000)
# from microbiome package you can calculate all alpha diversity indexes or specific ones:
#ps_2_alpha <- microbiome::alpha(ps_2, index = "observed")
ps_2_alpha <- microbiome::alpha(ps_2, index = "all")
str(ps_2_alpha)
## 'data.frame': 44 obs. of 22 variables:
## $ observed : num 399 453 393 396 371 418 406 459 321 435 ...
## $ chao1 : num 452 479 419 418 391 ...
## $ diversity_inverse_simpson : num 41 73.7 28.8 32.6 47.1 ...
## $ diversity_gini_simpson : num 0.976 0.986 0.965 0.969 0.979 ...
## $ diversity_shannon : num 4.46 4.92 4.26 4.4 4.52 ...
## $ diversity_fisher : num 70.6 82.4 69.3 70 64.7 ...
## $ diversity_coverage : num 15 27 11 12 17 15 14 21 14 16 ...
## $ evenness_camargo : num 0.689 0.729 0.859 0.714 0.729 ...
## $ evenness_pielou : num 0.744 0.804 0.714 0.736 0.764 ...
## $ evenness_simpson : num 0.1028 0.1628 0.0733 0.0824 0.127 ...
## $ evenness_evar : num 0.223 0.241 0.239 0.253 0.237 ...
## $ evenness_bulla : num 0.336 0.391 0.324 0.357 0.348 ...
## $ dominance_dbp : num 0.0736 0.0387 0.1221 0.092 0.0649 ...
## $ dominance_dmn : num 0.1301 0.0741 0.1881 0.1658 0.1134 ...
## $ dominance_absolute : num 1473 775 2442 1839 1298 ...
## $ dominance_relative : num 0.0736 0.0387 0.1221 0.092 0.0649 ...
## $ dominance_simpson : num 0.0244 0.0136 0.0347 0.0306 0.0212 ...
## $ dominance_core_abundance : num 0.444 0.33 0.233 0.521 0.379 ...
## $ dominance_gini : num 0.971 0.958 0.974 0.969 0.97 ...
## $ rarity_log_modulo_skewness: num 2.06 2.05 2.06 2.06 2.05 ...
## $ rarity_low_abundance : num 0.138 0.179 0.152 0.172 0.156 ...
## $ rarity_rare_abundance : num 0.344 0.361 0.614 0.292 0.313 ...
# extract metadata from phyloseq object (ps_2)
ps_2_meta <- meta(ps_2)
class(ps_2_meta)
## [1] "data.frame"
# merge the alpha diversity indexe/s with metadata (which is now a data frame)
rownames(ps_2_alpha) == rownames(ps_2_meta)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
merged_df <- cbind(ps_2_meta,ps_2_alpha)
#colnames(merged_df)
# Shannon:
p1 <- ggviolin(merged_df, x = "Sample_Type", y = "diversity_shannon",
add = "boxplot", fill = "Sample_Type")
print(p1)
# looking at the distribution of the variable to choose the best test for comparing:
summary(merged_df$diversity_shannon)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.753 4.343 4.529 4.546 4.776 5.131
hist(merged_df$diversity_shannon)
ggqqplot(merged_df$diversity_shannon)
shapiro.test(merged_df$diversity_shannon)
##
## Shapiro-Wilk normality test
##
## data: merged_df$diversity_shannon
## W = 0.9868, p-value = 0.8891
# is diversity_shannon (Shannon index) following a normal distribution or not?
# if normal: parametric tests are recommended
t.test(merged_df$diversity_shannon~Sample_Type,data=merged_df, var.equal = T)
##
## Two Sample t-test
##
## data: merged_df$diversity_shannon by Sample_Type
## t = 1.4618, df = 42, p-value = 0.1512
## alternative hypothesis: true difference in means between group PreFMT and group Donor is not equal to 0
## 95 percent confidence interval:
## -0.05820449 0.36412339
## sample estimates:
## mean in group PreFMT mean in group Donor
## 4.587469 4.434510
var.test(merged_df$diversity_shannon~Sample_Type,data=merged_df)
##
## F test to compare two variances
##
## data: merged_df$diversity_shannon by Sample_Type
## F = 2.393, num df = 31, denom df = 11, p-value = 0.1265
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7693335 5.8424866
## sample estimates:
## ratio of variances
## 2.392961
# if not normal: non-parametric tests are recommended
wilcox.test(diversity_shannon ~ Sample_Type, data = merged_df, paired = FALSE)
##
## Wilcoxon rank sum exact test
##
## data: diversity_shannon by Sample_Type
## W = 254, p-value = 0.1057
## alternative hypothesis: true location shift is not equal to 0
# Univariable and multivariate alpha-diversity comparisson:
# Unadjusted:
Shannon_model1 <- lm(merged_df$diversity_shannon~merged_df$Sample_Type)
summary(Shannon_model1)
##
## Call:
## lm(formula = merged_df$diversity_shannon ~ merged_df$Sample_Type)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.83481 -0.17242 -0.00933 0.21585 0.54324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.58747 0.05464 83.951 <2e-16 ***
## merged_df$Sample_TypeDonor -0.15296 0.10464 -1.462 0.151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3091 on 42 degrees of freedom
## Multiple R-squared: 0.04842, Adjusted R-squared: 0.02576
## F-statistic: 2.137 on 1 and 42 DF, p-value: 0.1512
# Adjusted: Let's say we want to adjust for Age ("Age") and sex ("Host_Sex"):
Shannon_model2 <- lm(merged_df$diversity_shannon ~ merged_df$Sample_Type +
merged_df$Host_Sex + merged_df$Age)
summary(Shannon_model2)
##
## Call:
## lm(formula = merged_df$diversity_shannon ~ merged_df$Sample_Type +
## merged_df$Host_Sex + merged_df$Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82289 -0.15304 -0.00487 0.22079 0.54041
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.84869 0.64745 5.944 5.66e-07 ***
## merged_df$Sample_TypeDonor 0.06438 0.23418 0.275 0.785
## merged_df$Host_SexMale 0.10744 0.09801 1.096 0.280
## merged_df$Age 0.01475 0.01416 1.042 0.304
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3081 on 40 degrees of freedom
## Multiple R-squared: 0.09949, Adjusted R-squared: 0.03195
## F-statistic: 1.473 on 3 and 40 DF, p-value: 0.2364
# Richness (Observed taxa)
p2 <- ggviolin(merged_df, x = "Sample_Type", y = "observed",
add = "boxplot", fill = "Sample_Type")
print(p2)
# looking at the distribution of the variable to choose the best test for comparing:
summary(merged_df$observed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 270.0 396.0 438.0 438.8 471.0 580.0
hist(merged_df$observed)
ggqqplot(merged_df$observed)
shapiro.test(merged_df$observed)
##
## Shapiro-Wilk normality test
##
## data: merged_df$observed
## W = 0.97546, p-value = 0.4635
# is Richness (Observed taxa) following a normal distribution or not?
# if normal: parametric tests are recommended
t.test(merged_df$observed~Sample_Type,data=merged_df, var.equal = T)
##
## Two Sample t-test
##
## data: merged_df$observed by Sample_Type
## t = 0.94214, df = 42, p-value = 0.3515
## alternative hypothesis: true difference in means between group PreFMT and group Donor is not equal to 0
## 95 percent confidence interval:
## -24.18459 66.53875
## sample estimates:
## mean in group PreFMT mean in group Donor
## 444.5938 423.4167
var.test(merged_df$observed~Sample_Type,data=merged_df)
##
## F test to compare two variances
##
## data: merged_df$observed by Sample_Type
## F = 2.0086, num df = 31, denom df = 11, p-value = 0.2201
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.6457505 4.9039701
## sample estimates:
## ratio of variances
## 2.008564
# if not normal: non-parametric tests are recommended
wilcox.test(observed ~ Sample_Type, data = merged_df, paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: observed by Sample_Type
## W = 233.5, p-value = 0.2798
## alternative hypothesis: true location shift is not equal to 0
# Univariable and multivariate alpha-diversity comparisson:
# Unadjusted:
Richness_model1 <- lm(merged_df$observed~merged_df$Sample_Type)
summary(Shannon_model1)
##
## Call:
## lm(formula = merged_df$diversity_shannon ~ merged_df$Sample_Type)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.83481 -0.17242 -0.00933 0.21585 0.54324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.58747 0.05464 83.951 <2e-16 ***
## merged_df$Sample_TypeDonor -0.15296 0.10464 -1.462 0.151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3091 on 42 degrees of freedom
## Multiple R-squared: 0.04842, Adjusted R-squared: 0.02576
## F-statistic: 2.137 on 1 and 42 DF, p-value: 0.1512
# Adjusted: Let's say we want to adjust for Age ("Age") and sex ("Host_Sex"):
Richness_model2 <- lm(merged_df$observed ~ merged_df$Sample_Type +
merged_df$Age + merged_df$Host_Sex)
summary(Shannon_model2)
##
## Call:
## lm(formula = merged_df$diversity_shannon ~ merged_df$Sample_Type +
## merged_df$Host_Sex + merged_df$Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82289 -0.15304 -0.00487 0.22079 0.54041
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.84869 0.64745 5.944 5.66e-07 ***
## merged_df$Sample_TypeDonor 0.06438 0.23418 0.275 0.785
## merged_df$Host_SexMale 0.10744 0.09801 1.096 0.280
## merged_df$Age 0.01475 0.01416 1.042 0.304
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3081 on 40 degrees of freedom
## Multiple R-squared: 0.09949, Adjusted R-squared: 0.03195
## F-statistic: 1.473 on 3 and 40 DF, p-value: 0.2364
So based on the results there is no significant difference between the two groups (Rd increased versus decreased) in alpha diversity using Richness and Shannon index
**Note: Many papers are also using Chao1 index to determine richness levels. However, some of us consider this metric inappropriate for (amplicon) sequence data as it tends to extrapolate sequencing error rather than true richness.
Beta diversity describes the variation in species composition between samples, often expressed as the “distance” or “dissimilarity” between them. The choice of distance metric depends on the context and goals of the analysis, as different metrics capture different aspects of community composition.
For example, Bray-Curtis and Jaccard distances treat each species or feature as a distinct unit, without considering relationships between species. These metrics only focus on shared presence or abundance between samples. In contrast, UniFrac distances account for the evolutionary relationships (phylogeny) between species. With UniFrac, samples with phylogenetically distinct species show higher distances than samples with species that are different but closely related. This makes UniFrac especially valuable when genetic or evolutionary relatedness is meaningful in the analysis.
Another key distinction is whether the distance metric considers feature abundance. Some methods are weighted (e.g., weighted UniFrac and Bray-Curtis), which incorporate the relative abundance of species in the calculation. Others are unweighted (e.g., Jaccard and unweighted UniFrac), which treat species as simply present or absent. Weighted metrics capture differences in both composition and abundance, while unweighted metrics focus solely on the presence or absence of species, providing a simpler, binary view of beta diversity.
Most common distance metrics used are:
bray: Bray-Curtis
unifrac: unweighted UniFrac distance
wunifrac: weighted-UniFrac distance
dpcoa: sample-wise distance used in Double Principle Coordinate Analysis
jsd: Jensen-Shannon Divergence
The obtained distance matrices are used both for visualization and statistical testing.
In a static plot on a screen, we are typically limited to only two dimensions (x and y), which constrains our ability to visualize high-dimensional data. Ordination is a set of methods designed to reduce the dimensionality of complex datasets for the purpose of visualization, simplification, and interpretation. By reducing data from multiple dimensions to just two or three, ordination enables researchers to visualize and interpret underlying patterns, trends, or gradients that would otherwise be difficult to discern.
Ordination seeks to simplify the data by finding new dimensions, often called axes, components, or factors, that capture the greatest amount of variance (i.e., variation or information) in the dataset. The first axis is chosen to capture as much variance as possible in the data, and subsequent axes capture the remaining variance, each being orthogonal (or perpendicular) to the previous ones. This orthogonality ensures that each axis is independent and does not overlap in the information it represents. Through this process, high-dimensional relationships are projected into a lower-dimensional space in a way that retains as much of the original data’s structure as possible.
To achieve maximum variance on each axis, ordination techniques often involve mathematical transformations that “rotate” the data in its original multidimensional space. This rotation aligns the data along new axes that maximize distinct sources of variation, making it easier to identify meaningful ecological gradients or patterns.
A simple example of ordination is a 3d plot. While 3 dimensions can be viewed, the actual image is a projection in 2D.
p <- plot_ly(data = ps_2@otu_table@.Data %>% data.frame(),
x = ~ASVX_1, y = ~ASVX_2, z = ~ASVX_3,
color = ~ASVX_1,
type = "scatter3d",
mode = "markers")
p
For performing the ordination we use ordinate() function in phyloseq which does both distance calculation and the ordination: Different ordination Constrained and unconstrained methods are available.
1- method (Optional). A character string. Default is “DCA”.
Currently supported method options are: c(“DCA”, “CCA”, “RDA”, “CAP”, “DPCoA”, “NMDS”, “MDS”, “PCoA”)
DCA: Performs detrended correspondence analysis usingdecorana
CCA: Performs correspondence analysis, or optionally, constrained correspondence analysis (a.k.a. canonical correspondence analysis), via cca
RDA: Performs redundancy analysis, if no contrainsts are given a principal components analysis is performed, via rda
CAP: [Partial] Constrained Analysis of Principal Coordinates or distance-based RDA, via capscale. See capscale.phyloseq for more details. In particular, a formula argument must be provided.
DPCoA: Performs Double Principle Coordinate Analysis using a (corrected, if necessary) phylogenetic/patristic distance between species. The calculation is performed by DPCoA(), which ultimately uses dpcoa after making the appropriate accessions/corrections of the data.
NMDS: Performs Non-metric MultiDimenstional Scaling of a sample-wise ecological distance matrix onto a user-specified number of axes, k. By default, k=2, but this can be modified as a supplementary argument. This method is ultimately carried out by metaMDS after the appropriate accessions and distance calculations. Because metaMDS includes its own distance calculation wrappers to vegdist, and these provide additional functionality in the form of species scores, ordinate will pass-on the distance argument to metaMDS if it is among the supported vegdist methods. However, all distance methods supported by distance are supported here, including “unifrac” (the default) and “DPCoA”.
MDS/PCoA: Performs principal coordinate analysis (also called principle coordinate decomposition, multidimensional scaling (MDS), or classical scaling) of a distance matrix (Gower 1966), including two correction methods for negative eigenvalues. See pcoa for further details.
Lets visualize the betadiversity using the most common distance metric Bray-Curtis and PCoA (MDS) ordination.
# generate distance metrics: (MDS (“PCoA”) on Bray-Curtis)
ord_ps_2_bray <- ordinate(ps_2, "MDS", "bray")
# generate the plot
p1_bray <- plot_ordination(ps_2, ord_ps_2_bray, type="samples",
color="Sample_Type", title="Samples",
axes = 1:2, shape = NULL) +
geom_point(size=3)
print(p1_bray)
## generate a 3D plot:
# Extract ordination scores
ord_scores_bray <- as.data.frame(ord_ps_2_bray$vectors)
# Add metadata to the scores
ord_scores_bray$Sample_Type <- sample_data(ps_2)$Sample_Type
# collect only the first 3 Axis and variable of interest (Sample_Type):
ord_scores_bray_3d <- ord_scores_bray[, c("Axis.1", "Axis.2", "Axis.3", "Sample_Type")]
colnames(ord_scores_bray_3d)[1:3] <- c("PCoA1", "PCoA2", "PCoA3")
p <- plot_ly(data = ord_scores_bray_3d,
x = ~PCoA1, y = ~PCoA2, z = ~PCoA3,
color = ~Sample_Type,
type = "scatter3d",
mode = "markers")
p <- p %>% layout(scene = list(xaxis = list(title = 'PCoA1'),
yaxis = list(title = 'PCoA2'),
zaxis = list(title = 'PCoA3')),
title = "3D Bray-Curtis Ordination")
p
# calculate and generate multiple plots using one distance metrics (bray-curtis) and
# multiple methods:
dist = "bray"
ord_meths = c("DCA", "CCA", "RDA", "MDS")
plist = llply(as.list(ord_meths), function(i, physeq, dist){
ordi = ordinate(physeq, method=i, distance=dist)
plot_ordination(physeq, ordi, type="samples", color="Sample_Type")
}, ps_2, dist)
names(plist) <- ord_meths
pdataframe = ldply(plist, function(x){
df = x$data[, 1:2]
colnames(df) = c("Axis_1", "Axis_2")
return(cbind(df, x$data))
})
names(pdataframe)[1] = "method"
p = ggplot(pdataframe, aes(Axis_1, Axis_2, color=Sample_Type))
p = p + geom_point(size=3)
p = p + facet_wrap(~method, scales="free")
p
# look at each one:
plist[[4]]
################################################################################
# MDS (“PCoA”) on weighted Unifrac Distances:
ord_ps_2_WUni <- ordinate(ps_2, "PCoA", "unifrac", weighted=TRUE)
plot_ordination(ps_2, ord_ps_2_WUni, color="Sample_Type", shape=NULL)
# biplots for both samples and taxas:
plot_ordination(ps_2, ord_ps_2_WUni, type="biplot", color="Phylum", shape="Sample_Type")
# splited plots for samples and taxa:
plot_ordination(ps_2, ord_ps_2_WUni, type="split", color="Phylum", shape="Sample_Type")
It is bad practice to include the outcome in a constrained ordination. Due to the high dimensionality of the data there is usually some hyperplane that can seperate groups very well, even though no significant difference exists.
ord <- ordinate(ps_2, method = "RDA", formula = ~Sample_Type, distance = "bray")
plot_ordination(physeq = ps_2, ordination = ord, color="Sample_Type")
However, if we shuffle our parameter we can still obtain good seperation even though the association is now complete random.
ps_2@sam_data$Sample_Type_shuffle <- sample(ps_2@sam_data$Sample_Type)
ord <- ordinate(ps_2, method = "RDA", formula = ~Sample_Type_shuffle, distance = "bray")
plot_ordination(physeq = ps_2, ordination = ord, color="Sample_Type_shuffle")
PERMANOVA uses distance matrices calculated from beta diversity measures (e.g., Bray-Curtis, Jaccard, UniFrac) as input. PERMANOVA can incorporate multiple explanatory variables and their interactions. So confounders can be considered. PERMANOVA is a non-parametric test, meaning it doesn’t assume normal distribution of the data.
# PERMANOVA using Bray-Curtis (method="bray") distance metrics
permanova1 <- adonis2(ps_2@otu_table ~ ps_2@sam_data [["Sample_Type"]],
permutations=999, method = "bray")
permanova1
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = ps_2@otu_table ~ ps_2@sam_data[["Sample_Type"]], permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## ps_2@sam_data[["Sample_Type"]] 1 0.3164 0.03269 1.4195 0.038 *
## Residual 42 9.3607 0.96731
## Total 43 9.6771 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#permanova1_adj <- adonis2(ps_2@otu_table ~ ps_2@sam_data [["Sample_Type"]] +
# ps_2@sam_data [["Age"]] +
# ps_2@sam_data [["Host_Sex"]], permutations=999,
# method = "bray")
permanova1_adj <- adonis2(ps_2@otu_table ~ ps_2@sam_data [["Age"]] +
ps_2@sam_data [["Host_Sex"]] +
ps_2@sam_data [["Sample_Type"]], permutations=999,
method = "bray")
permanova1_adj
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = ps_2@otu_table ~ ps_2@sam_data[["Age"]] + ps_2@sam_data[["Host_Sex"]] + ps_2@sam_data[["Sample_Type"]], permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## ps_2@sam_data[["Age"]] 1 0.3268 0.03377 1.4655 0.024 *
## ps_2@sam_data[["Host_Sex"]] 1 0.2448 0.02530 1.0979 0.267
## ps_2@sam_data[["Sample_Type"]] 1 0.1850 0.01912 0.8296 0.816
## Residual 40 8.9204 0.92181
## Total 43 9.6771 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Attention: the order of covariates matter because the covariates’ order can change the proportion of the variation that a variable can have on the microbiome composition.
It is recommended to put your variable of interest at the end of the formula. This way variance that is associated with confounders is attributed to these rather the variable of interest.
Determining differential abundance of microbes might seem trivial, but results show that outcomes of these analysis can be very stochastic and spurious depending on the methods used r Langille Lahti.
Analysis of Compositions of Microbiomes with Bias Correction (ANCOM-BC2)
ANCOMBC2() from ANCOMBC package can run on different Taxonomy levels (Phylum, Family, Genus, etc)
ANCOM-BC can handle multiple covariates and multiple test correction. Hence it is possible to add confounders as covariates in the formula.
# Phylum level
ancom1 = ancombc(data = ps_2, assay_name = "counts",
tax_level = "Phylum", phyloseq = NULL,
formula = "Age + Host_Sex + Sample_Type",
p_adj_method = "BH", prv_cut = 0.10, lib_cut = 1000,
group = "Sample_Type", struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5,
max_iter = 100, conserve = TRUE, alpha = 0.05, global = TRUE,
n_cl = 1, verbose = TRUE)
#ancom1[["res"]]
# is there any phylum differ between the groups?
ancom1[["res"]]$diff_abn$taxon [ancom1[["res"]]$diff_abn$Sample_Type==TRUE]
## [1] "Phylum:Verrucomicrobia" "Phylum:Lentisphaerae" "Phylum:Patescibacteria"
# Family level
ancom2 = ancombc(data = ps_2, assay_name = "counts",
tax_level = "Family", phyloseq = NULL,
formula = "Age + Host_Sex + Sample_Type",
p_adj_method = "BH", prv_cut = 0.10, lib_cut = 1000,
group = "Sample_Type", struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5,
max_iter = 100, conserve = TRUE, alpha = 0.05, global = TRUE,
n_cl = 1, verbose = TRUE)
#ancom2[["res"]]
# is there any family differ between the groups?
ancom2[["res"]]$diff_abn$taxon [ancom2[["res"]]$diff_abn$Sample_Type==TRUE]
## [1] "Family:Clostridiales_vadinBB60_group"
## [2] "Family:Family_XI"
## [3] "Order:Izimaplasmatales"
## [4] "Order:Mollicutes_RF39"
## [5] "Family:Succinivibrionaceae"
## [6] "Order:Bacteroidales"
## [7] "Family:Saccharimonadaceae"
# generating relative abundances:
ps_2_RA <- transform_sample_counts(ps_2, function(x) x/sum(x) * 100)
# Relative abundance Enterococcaceae family:
ps_2_Streptococcaceae = subset_taxa(ps_2_RA, Family == "Streptococcaceae")
plot_abundance = function(physeq,title = "",Facet = "Family", Color = "Family"){
mphyseq = psmelt(physeq)
mphyseq <- subset(mphyseq, Abundance > -1)
ggplot(data = mphyseq, mapping = aes_string(x = "Sample_Type",y = "Abundance",
color = Color, fill = Color)) +
geom_violin(fill = NA) +
geom_boxplot(fill=NA, width = 0.1) +
geom_point(size = 5, alpha = 0.3,
position = position_jitter(width = 0.3)) +
facet_wrap(facets = Facet) +
scale_y_log10()+
theme(legend.position="none")
}
plot_abundance(ps_2_Streptococcaceae,"")
# Genus level
ancom3 = ancombc(data = ps_2, assay_name = "counts",
tax_level = "Genus", phyloseq = NULL,
formula = "Age + Host_Sex + Sample_Type",
p_adj_method = "BH", prv_cut = 0.10, lib_cut = 1000,
group = "Sample_Type", struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5,
max_iter = 100, conserve = TRUE, alpha = 0.05, global = TRUE,
n_cl = 1, verbose = TRUE)
#ancom3[["res"]]
# is there any genus differ between the groups?
ancom3[["res"]]$diff_abn$taxon [ancom3[["res"]]$diff_abn$Sample_Type==TRUE]
## [1] "Genus:Shuttleworthia"
## [2] "Genus:Sellimonas"
## [3] "Genus:Fournierella"
## [4] "Genus:Ruminococcaceae_UCG-008"
## [5] "Genus:Ruminococcaceae_V9D2013_group"
## [6] "Genus:Ruminococcaceae_UCG-007"
## [7] "Genus:Ruminiclostridium"
## [8] "Family:Clostridiales_vadinBB60_group"
## [9] "Genus:Mitsuokella"
## [10] "Genus:Veillonella"
## [11] "Genus:Anaerofustis"
## [12] "Genus:Parvimonas"
## [13] "Order:Izimaplasmatales"
## [14] "Order:Mollicutes_RF39"
## [15] "Family:Erysipelotrichaceae"
## [16] "Genus:Holdemania"
## [17] "Genus:Faecalitalea"
## [18] "Genus:Merdibacter"
## [19] "Genus:Methanosphaera"
## [20] "Genus:Succinivibrio"
## [21] "Genus:Enterobacter"
## [22] "Order:Bacteroidales"
## [23] "Genus:Prevotellaceae_UCG-001"
## [24] "Genus:Prevotella_2"
## [25] "Genus:Prevotellaceae_NK3B31_group"
## [26] "Family:Barnesiellaceae"
## [27] "Family:Saccharimonadaceae"
## [28] "Genus:F0332"
## [29] "Genus:Libanicoccus"
## [30] "Genus:Coriobacteriaceae_UCG-003"
## [31] "Genus:Lachnospiraceae_NK4B4_group"
## [32] "Genus:Lachnospiraceae_UCG-008"
## [33] "Genus:Lactonifactor"
## [34] "Genus:UC5-1-2E3"
## [35] "Genus:Hungatella"
## [36] "Genus:Butyrivibrio"
## [37] "Genus:Lachnospiraceae_UCG-003"
# Relative abundance Enterococcus genus:
ps_2_Mitsuokella = subset_taxa(ps_2_RA, Genus == "Mitsuokella")
plot_abundance = function(physeq,title = "",Facet = "Genus", Color = "Genus"){
mphyseq = psmelt(physeq)
mphyseq <- subset(mphyseq, Abundance > -1)
ggplot(data = mphyseq, mapping = aes_string(x = "Sample_Type",y = "Abundance",
color = Color, fill = Color)) +
geom_violin(fill = NA) +
geom_boxplot(fill=NA, width = 0.1) +
geom_point(size = 5, alpha = 0.3,
position = position_jitter(width = 0.3)) +
facet_wrap(facets = Facet) + scale_y_log10()+
theme(legend.position="none")
}
plot_abundance(ps_2_Mitsuokella,"")
# Relative abundance Enterococcus genus:
ps_2_Atopobium = subset_taxa(ps_2_RA, Genus == "Shuttleworthia")
plot_abundance(ps_2_Atopobium,"")
# Relative abundance Enterococcus genus:
ps_2_Tyzzerella_4 = subset_taxa(ps_2_RA, Genus == "Sellimonas")
plot_abundance(ps_2_Tyzzerella_4,"")
# Relative abundance Enterococcus genus:
ps_2_Peptostreptococcus = subset_taxa(ps_2_RA, Genus == "Fournierella")
plot_abundance(ps_2_Peptostreptococcus,"")
# Relative abundance Enterococcus genus:
ps_2_Lactonifactor = subset_taxa(ps_2_RA, Genus == "Ruminococcaceae_UCG-008")
plot_abundance(ps_2_Lactonifactor,"")
When dealing with more complex designs we need to account for nesting of samples. In this dataset when we want to compare the changes over time we need to account for the pairing of samples from each of the subjects. For univariate testing you could use paired tests like t.test or wilcoxon. However to allow for more complex designs we will use linear mixed models.
ps_paired <- prune_samples(ps@sam_data$Sample_Type!="Donor", ps)
ps_paired <- rarefy_even_depth(ps_paired)
df.ad <- cbind(estimate_richness(ps_paired, measures = c("Observed","Shannon")), ps_paired@sam_data %>% data.frame())
df.ad$FPD <- picante::pd(samp = as.matrix(unclass(otu_table(ps_paired))), tree = phy_tree(ps_paired), include.root = F)[gsub("X","",rownames(df.ad)),"PD"]
df.ad.long <- df.ad %>% pivot_longer(cols = c("Shannon","Observed","FPD"), names_to = "Diversity_metric", values_to = "Diversity")
diversity_names <- c(
`Observed` = "Observed",
`Shannon` = "Shannon",
`FPD` = "Phylogenetic Diversity")
p.adiv.f20 <- ggplot(df.ad.long, aes(x=Time_Point, y=Diversity, group=Sample_Type, color=Sample_Type)) +
#geom_boxplot(outlier.colour = NA, aes(group=Time_Point)) +
ggbeeswarm::geom_beeswarm(size=2, cex = 1,priority = "random") +
#facet_grid(variable~Host_Body_Site, scale="free_y", labeller = as_labeller(diversity_names)) +
facet_grid(Diversity_metric~., scale="free_y") +
scale_colour_viridis_d(option = "B", begin = 0.25, end = 0.75) +
#scale_fill_viridis_d(option = "B", begin = 0.25, end = 0.75, alpha = 1) +
geom_line(aes(group=Subject_ID), alpha=0.2) +
labs(x=NULL,y=NULL) +
theme_bw() +
geom_smooth(method = "lm") +
stat_cor() +
#geom_line(aes(group=Subject_ID), color="black") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
#stat_compare_means() +
NULL; plot(p.adiv.f20)
summary(lmer(Shannon~Sample_Type + (1 | Subject_ID), df.ad))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Shannon ~ Sample_Type + (1 | Subject_ID)
## Data: df.ad
##
## REML criterion at convergence: 28.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.87855 -0.55200 0.02058 0.63760 1.75162
##
## Random effects:
## Groups Name Variance Std.Dev.
## Subject_ID (Intercept) 0.05543 0.2354
## Residual 0.04457 0.2111
## Number of obs: 64, groups: Subject_ID, 32
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 4.59445 0.05590 47.42897 82.189 <2e-16 ***
## Sample_TypePostFMT -0.07907 0.05278 31.00000 -1.498 0.144
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Smpl_TyPFMT -0.472
summary(lmer(Observed~Sample_Type + (1 | Subject_ID), df.ad))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Observed ~ Sample_Type + (1 | Subject_ID)
## Data: df.ad
##
## REML criterion at convergence: 712.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.83428 -0.51014 -0.04003 0.53135 2.01376
##
## Random effects:
## Groups Name Variance Std.Dev.
## Subject_ID (Intercept) 4435 66.59
## Residual 2317 48.14
## Number of obs: 64, groups: Subject_ID, 32
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 464.94 14.53 43.31 32.008 < 2e-16 ***
## Sample_TypePostFMT 66.66 12.03 31.00 5.539 4.58e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Smpl_TyPFMT -0.414
summary(lmer(FPD~Sample_Type + (1 | Subject_ID), df.ad))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: FPD ~ Sample_Type + (1 | Subject_ID)
## Data: df.ad
##
## REML criterion at convergence: 419.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.5162 -0.4420 -0.1168 0.3515 2.0492
##
## Random effects:
## Groups Name Variance Std.Dev.
## Subject_ID (Intercept) 35.89 5.991
## Residual 22.05 4.696
## Number of obs: 64, groups: Subject_ID, 32
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 42.745 1.346 44.809 31.766 < 2e-16 ***
## Sample_TypePostFMT 4.624 1.174 31.000 3.939 0.000433 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Smpl_TyPFMT -0.436
We can account for the inter individual variance by what is called multilevel PCA. This allows us to only focus on the variance over the two time points.
diverse.pca = pca(ps_paired@otu_table+1, ncomp = 10, logratio = 'CLR', multilevel = ps_paired@sam_data$Subject_ID)
plot(diverse.pca)
time="Time_Point"
var="Sample_Type"
x="PC1" # first component
y="PC2" # second component
#build df
variates <- diverse.pca$variates$X
df <- data.frame(row.names = rownames(variates),
variates,
ps_paired@sam_data[rownames(variates),])
# include centroids
df$PTV_TP <- paste0(df[,var],df[,time])
centroids <- aggregate(variates~PTV_TP,data=df,mean)
df <- merge(df,centroids,by="PTV_TP",suffixes=c("",".centroid"))
# plot
p.multi.pca <- ggplot(df, aes_string(x=x, y=y, color=var, shape=time)) +
geom_segment(aes_string(x=paste0(x,".centroid"), y=paste0(y,".centroid"), xend=x, yend=y), alpha=0.3, color="black") +
# scale_color_viridis_d(option = "C", end = 0.75) +
scale_color_manual(values = RColorBrewer::brewer.pal(n = 5, name = "Set1")) +
theme_bw() +
labs(x=paste0(x,": ",round(diverse.pca$prop_expl_var$X[x], 3)*100, "% variance explained"),
y=paste0(y,": ",round(diverse.pca$prop_expl_var$X[y], 3)*100, "% variance explained"),
title="Multilevel PCA") +
#facet_wrap(~Time_Point) +
geom_point(size=3) +
ggrepel::geom_text_repel(aes(label=Subject_ID)) +
NULL; plot(p.multi.pca)
df <- data.frame(row.names = rownames(variates),
variates,
ps_paired@sam_data[rownames(variates),])
summary(manova(variates[,1:2]~df$Time_Point*df$Sample_Type))
## Df Pillai approx F num Df den Df Pr(>F)
## df$Time_Point 1 0.58392 42.804 2 61 2.425e-12 ***
## Residuals 62
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(variates~df$Time_Point*df$Sample_Type))
## Response PC1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 2125.5 2125.51 60.081 1.07e-10 ***
## Residuals 62 2193.4 35.38
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response PC2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 195.22 195.225 6.2657 0.01496 *
## Residuals 62 1931.79 31.158
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response PC3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 48.65 48.647 1.5516 0.2176
## Residuals 62 1943.86 31.353
##
## Response PC4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 1.05 1.0504 0.0402 0.8418
## Residuals 62 1621.69 26.1563
##
## Response PC5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 0.01 0.0085 4e-04 0.985
## Residuals 62 1477.82 23.8358
##
## Response PC6 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 9.78 9.7752 0.4539 0.503
## Residuals 62 1335.13 21.5343
##
## Response PC7 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 52.61 52.607 2.8087 0.09879 .
## Residuals 62 1161.29 18.730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response PC8 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 30.32 30.321 1.6368 0.2055
## Residuals 62 1148.51 18.524
##
## Response PC9 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 19.76 19.756 1.14 0.2898
## Residuals 62 1074.48 17.330
##
## Response PC10 :
## Df Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point 1 24.34 24.344 1.4379 0.235
## Residuals 62 1049.68 16.930
We also can account for it in the permanova by using the stata argument
d <- phyloseq::distance(ps_paired, "bray")
adonis2(d ~ ps_paired@sam_data$Sample_Type, strata = ps_paired@sam_data$Subject_ID)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ ps_paired@sam_data$Sample_Type, strata = ps_paired@sam_data$Subject_ID)
## Df SumOfSqs R2 F Pr(>F)
## ps_paired@sam_data$Sample_Type 1 0.2355 0.01763 1.1126 0.001 ***
## Residual 62 13.1212 0.98237
## Total 63 13.3567 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We again can use lmer to test differences in taxa abundance. Note due to multiple testing burden it is recommended to reduced the set of taxa that get tested. Here we will only test the top 100 most abundant taxa.
lmerres <- function(asv, ps){
psm <- psmelt(prune_taxa(asv, ps))
lmerTest::lmer(Abundance ~ Sample_Type + (1|Subject_ID), psm)
}
taxa <- taxa_sums(ps_paired) %>% sort(decreasing = T) %>% head(250) %>% names()
asv_lmermodels <- lapply(taxa, function(x) lmerres(x, ps_paired))
df <- data.frame(taxa=taxa,
pvals = unlist(lapply(asv_lmermodels, function(x) coefficients(summary(x))[2,5])),
ps_paired@tax_table[taxa,]
)
df$padj <- p.adjust(df$pvals, method = "fdr")
df[df$padj<0.05,]
## taxa pvals Kingdom Phylum Class Order
## ASVX_7703 ASVX_7703 3.239364e-06 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_568 ASVX_568 6.912786e-04 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_1335 ASVX_1335 4.047797e-05 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_4346 ASVX_4346 3.929851e-05 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_4915 ASVX_4915 1.244758e-03 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_2278 ASVX_2278 6.617822e-04 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_6835 ASVX_6835 2.810065e-04 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_4239 ASVX_4239 4.482078e-05 Bacteria Firmicutes Clostridia Clostridiales
## Family Genus Species padj
## ASVX_7703 Lachnospiraceae Dorea <NA> 0.0008098409
## ASVX_568 Lachnospiraceae Agathobacter <NA> 0.0246885225
## ASVX_1335 Lachnospiraceae Blautia <NA> 0.0028012987
## ASVX_4346 Lachnospiraceae Agathobacter <NA> 0.0028012987
## ASVX_4915 Lachnospiraceae Lachnospiraceae_ND3007_group <NA> 0.0388986868
## ASVX_2278 Lachnospiraceae Dorea <NA> 0.0246885225
## ASVX_6835 Lachnospiraceae Blautia <NA> 0.0140503265
## ASVX_4239 Lachnospiraceae Roseburia <NA> 0.0028012987
asv <- df[df$padj<0.05,] %>% rownames()
psm <- psmelt(prune_taxa(asv, ps_paired))
ggplot(psm, aes(x=Sample_Type, y=Abundance)) +
ggbeeswarm::geom_beeswarm() +
geom_boxplot() +
facet_wrap(~OTU+Genus, scales="free")
To test if there is any engraftment of species we can test if the distance of the patients is reduced after FMT.
ps_rare <- rarefy_even_depth(ps)
d <- distance(ps_rare, "jaccard", binary=T)
vegan::adonis2(d~ps_rare@sam_data$Sample_Type)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## vegan::adonis2(formula = d ~ ps_rare@sam_data$Sample_Type)
## Df SumOfSqs R2 F Pr(>F)
## ps_rare@sam_data$Sample_Type 2 0.7578 0.04058 1.5439 0.001 ***
## Residual 73 17.9161 0.95942
## Total 75 18.6740 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- reshape2::melt(as.matrix(d)) # we convert the matrix to long format (each index becomes a row)
df$Var1_Donor_ID <- ps_rare@sam_data[df$Var1,]$Donor_Sample_ID
df$Var2_Donor_ID <- ps_rare@sam_data[df$Var2,]$Donor_Sample_ID
df$Var1_Sample_Type <- ps_rare@sam_data[df$Var1,]$Sample_Type
df$Var2_Sample_Type <- ps_rare@sam_data[df$Var2,]$Sample_Type
df <- df[df$Var1_Sample_Type!="Donor",] # We only want to compare distance between recipient/donors
df <- df[df$Var2_Sample_Type=="Donor",]
df$Donor_match <- df$Var1_Donor_ID==df$Var2_Donor_ID # check if the comparison is within or between
ggplot(df, aes(x=Donor_match, y=value)) +
facet_wrap(~Var1_Sample_Type) +
geom_jitter() +
stat_compare_means() +
NULL